{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Il est temps de trancher et de découper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce *notebook*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install datasets evaluate transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget \"https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip\"\n",
    "!unzip drugsCom_raw.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "data_files = {\"train\": \"drugsComTrain_raw.tsv\", \"test\": \"drugsComTest_raw.tsv\"}\n",
    "# \\t est le caractère de tabulation en Python\n",
    "drug_dataset = load_dataset(\"csv\", data_files=data_files, delimiter=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_sample = drug_dataset[\"train\"].shuffle(seed=42).select(range(1000))\n",
    "# Un coup d'œil sur les premiers exemples\n",
    "drug_sample[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for split in drug_dataset.keys():\n",
    "    assert len(drug_dataset[split]) == len(drug_dataset[split].unique(\"Unnamed: 0\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset = drug_dataset.rename_column(\n",
    "    original_column_name=\"Unnamed: 0\", new_column_name=\"patient_id\"\n",
    ")\n",
    "drug_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lowercase_condition(example):\n",
    "    return {\"condition\": example[\"condition\"].lower()}\n",
    "\n",
    "\n",
    "drug_dataset.map(lowercase_condition)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_nones(x):\n",
    "    return x[\"condition\"] is not None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(lambda x: x * x)(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(lambda base, height: 0.5 * base * height)(4, 8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset = drug_dataset.filter(lambda x: x[\"condition\"] is not None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset = drug_dataset.map(lowercase_condition)\n",
    "# Vérification que la mise en minuscule a fonctionné\n",
    "drug_dataset[\"train\"][\"condition\"][:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_review_length(example):\n",
    "    return {\"review_length\": len(example[\"review\"].split())}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset = drug_dataset.map(compute_review_length)\n",
    "# Inspecter le premier exemple d'entraînement\n",
    "drug_dataset[\"train\"][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset[\"train\"].sort(\"review_length\")[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset = drug_dataset.filter(lambda x: x[\"review_length\"] > 30)\n",
    "print(drug_dataset.num_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import html\n",
    "\n",
    "text = \"I&#039;m a transformer called BERT\"\n",
    "html.unescape(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset = drug_dataset.map(lambda x: {\"review\": html.unescape(x[\"review\"])})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_drug_dataset = drug_dataset.map(\n",
    "    lambda x: {\"review\": [html.unescape(o) for o in x[\"review\"]]}, batched=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
    "\n",
    "\n",
    "def tokenize_function(examples):\n",
    "    return tokenizer(examples[\"review\"], truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "slow_tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\", use_fast=False)\n",
    "\n",
    "\n",
    "def slow_tokenize_function(examples):\n",
    "    return slow_tokenizer(examples[\"review\"], truncation=True)\n",
    "\n",
    "\n",
    "tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize_and_split(examples):\n",
    "    return tokenizer(\n",
    "        examples[\"review\"],\n",
    "        truncation=True,\n",
    "        max_length=128,\n",
    "        return_overflowing_tokens=True,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = tokenize_and_split(drug_dataset[\"train\"][0])\n",
    "[len(inp) for inp in result[\"input_ids\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_dataset = drug_dataset.map(\n",
    "    tokenize_and_split, batched=True, remove_columns=drug_dataset[\"train\"].column_names\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(tokenized_dataset[\"train\"]), len(drug_dataset[\"train\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize_and_split(examples):\n",
    "    result = tokenizer(\n",
    "        examples[\"review\"],\n",
    "        truncation=True,\n",
    "        max_length=128,\n",
    "        return_overflowing_tokens=True,\n",
    "    )\n",
    "    # Extraire la correspondance entre les nouveaux et les anciens indices\n",
    "    sample_map = result.pop(\"overflow_to_sample_mapping\")\n",
    "    for key, values in examples.items():\n",
    "        result[key] = [values[i] for i in sample_map]\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)\n",
    "tokenized_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset.set_format(\"pandas\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset[\"train\"][:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = drug_dataset[\"train\"][:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "frequencies = (\n",
    "    train_df[\"condition\"]\n",
    "    .value_counts()\n",
    "    .to_frame()\n",
    "    .reset_index()\n",
    "    .rename(columns={\"index\": \"condition\", \"condition\": \"frequency\"})\n",
    ")\n",
    "frequencies.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "freq_dataset = Dataset.from_pandas(frequencies)\n",
    "freq_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset.reset_format()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset_clean = drug_dataset[\"train\"].train_test_split(train_size=0.8, seed=42)\n",
    "# Renommer la division par défaut \"test\" en \"validation\"\n",
    "drug_dataset_clean[\"validation\"] = drug_dataset_clean.pop(\"test\")\n",
    "# Ajoutez le jeu \"test\" à notre `DatasetDict`\n",
    "drug_dataset_clean[\"test\"] = drug_dataset[\"test\"]\n",
    "drug_dataset_clean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dataset_clean.save_to_disk(\"drug-reviews\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_from_disk\n",
    "\n",
    "drug_dataset_reloaded = load_from_disk(\"drug-reviews\")\n",
    "drug_dataset_reloaded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for split, dataset in drug_dataset_clean.items():\n",
    "    dataset.to_json(f\"drug-reviews-{split}.jsonl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -n 1 drug-reviews-train.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_files = {\n",
    "    \"train\": \"drug-reviews-train.jsonl\",\n",
    "    \"validation\": \"drug-reviews-validation.jsonl\",\n",
    "    \"test\": \"drug-reviews-test.jsonl\",\n",
    "}\n",
    "drug_dataset_reloaded = load_dataset(\"json\", data_files=data_files)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Il est temps de trancher et de découper",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
